{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import featuretools as ft\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# Dimensions of the synthetic benchmark dataset built when no cached EntitySet can be read\n",
    "num_rows = 10_000   # rows in the generated frame; also passed as chunk_size when computing features\n",
    "int_cols = 10       # number of random integer columns\n",
    "float_cols = 10     # number of random float columns\n",
    "id_columns = 4      # number of string-typed id columns\n",
    "num_groups = 1_000  # exclusive upper bound for the random id values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the EntitySet cached on disk when it can be read; otherwise build a\n",
    "# synthetic one from random data and cache it for later runs.\n",
    "try:\n",
    "    es = ft.read_entityset(\"groupby_benchmark/\")\n",
    "except Exception:\n",
    "    # Catch Exception, not a bare except, so KeyboardInterrupt / SystemExit still\n",
    "    # propagate instead of silently triggering regeneration of the cache.\n",
    "    print(\"creating entityset\")\n",
    "\n",
    "    # Locally seeded generator: a freshly generated dataset is the same on every\n",
    "    # run, and the global numpy random state is left untouched.\n",
    "    rng = np.random.RandomState(0)\n",
    "    int_df = pd.DataFrame(rng.randint(0, 100, size=(num_rows, int_cols))).add_suffix(\"_int\")\n",
    "    # Integers in [0, 100000) divided by 100 give floats with two decimal places.\n",
    "    float_df = pd.DataFrame(rng.randint(0, 100000, size=(num_rows, float_cols)) / 100).add_suffix(\"_float\")\n",
    "    # Ids are drawn from [0, num_groups) and stored as strings.\n",
    "    id_df = pd.DataFrame(rng.randint(0, num_groups, size=(num_rows, id_columns)), dtype=str).add_suffix(\"_id\")\n",
    "    df = pd.concat([int_df, float_df, id_df], axis=1)\n",
    "\n",
    "    # Tag every generated *_id column as an Id variable.\n",
    "    variable_types = {col: ft.variable_types.Id for col in id_df.columns}\n",
    "\n",
    "    es = ft.EntitySet()\n",
    "    es.entity_from_dataframe(entity_id=\"entity\",\n",
    "                             dataframe=df,\n",
    "                             index=\"index\",\n",
    "                             variable_types=variable_types,\n",
    "                             make_index=True)\n",
    "    # Persist to the directory the try-branch reads from.\n",
    "    es.to_csv(\"groupby_benchmark\", compression=\"gzip\")\n",
    "\n",
    "es"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Request feature definitions only (features_only=True) for the cumulative\n",
    "# group-by transform primitives; the resulting list is passed to\n",
    "# calculate_feature_matrix in the next cell.\n",
    "fl = ft.dfs(\n",
    "    entityset=es,\n",
    "    target_entity=\"entity\",\n",
    "    groupby_trans_primitives=[\"cum_sum\", \"cum_max\", \"cum_min\"],\n",
    "    max_depth=1,\n",
    "    features_only=True,\n",
    "    verbose=True,\n",
    ")\n",
    "\n",
    "# Number of feature definitions produced\n",
    "len(fl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate every feature in `fl` against the EntitySet, using a chunk size\n",
    "# equal to the number of generated rows.\n",
    "fm = ft.calculate_feature_matrix(\n",
    "    features=fl,\n",
    "    entityset=es,\n",
    "    chunk_size=num_rows,\n",
    "    verbose=True,\n",
    ")\n",
    "fm.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File that receives this run's feature matrix; the comparison cell reads it back.\n",
    "TEST_NAME = \"by_features_2.csv\"\n",
    "fm.to_csv(path_or_buf=TEST_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify that this run reproduces the reference output stored in master.csv.\n",
    "truth = pd.read_csv(\"master.csv\")\n",
    "test = pd.read_csv(TEST_NAME)\n",
    "# Keep only the reference's columns, in the reference's order, before comparing.\n",
    "test = test[truth.columns]\n",
    "\n",
    "# DataFrame.equals is an exact comparison (no floating-point tolerance).\n",
    "truth.equals(test)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
